Dependencies
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.0 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library(klaR)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Load the raw Black Friday purchase records; column types are guessed by readr.
data.tb <- "./data/BlackFriday.csv" %>%
  read_csv()
## Parsed with column specification:
## cols(
## User_ID = col_double(),
## Product_ID = col_character(),
## Gender = col_character(),
## Age = col_character(),
## Occupation = col_double(),
## City_Category = col_character(),
## Stay_In_Current_City_Years = col_character(),
## Marital_Status = col_double(),
## Product_Category_1 = col_double(),
## Product_Category_2 = col_double(),
## Product_Category_3 = col_double(),
## Purchase = col_double()
## )
#data.tb %>% head(25)
# Work on a random sample of 2000 rows. Sampling is done with replacement,
# so the same purchase record may appear more than once in the sample.
data_random.tb <- data.tb %>%
  sample_n(size = 2000, replace = TRUE)
#data_random_unique.tb <- unique(data_random.tb)
#test.tb <- data.tb %>% head(2000)
Find the unique values of the Age column
# List the distinct age-bracket labels present in the sample.
unique(data_random.tb[["Age"]])
## [1] "46-50" "18-25" "26-35" "51-55" "36-45" "55+" "0-17"
Function that maps each Age bracket label to a numeric code (an ordinal representation of the group)
#' Map age-bracket labels to their ordinal codes.
#'
#' @param age Character vector (or factor) of age-bracket labels,
#'   e.g. "26-35". A single string works exactly as before.
#' @return A numeric vector the same length as `age`: 1 for "0-17" up to 7
#'   for "55+", and `NA` for any label that is not one of the seven brackets.
ageFilter <- function(age) {
  codes <- c(
    "0-17" = 1, "18-25" = 2, "26-35" = 3, "36-45" = 4,
    "46-50" = 5, "51-55" = 6, "55+" = 7
  )
  # A named-vector lookup is vectorised, unlike switch(), which accepts only a
  # single string and invisibly returns NULL when no alternative matches.
  # as.character() makes factors index by label rather than by integer code.
  unname(codes[as.character(age)])
}
ageFilter("51-55")
## [1] 6
Apply ageFilter() to every element of the Age column and append the result to the tibble as a new numeric `age` column
##data.tb$age <- ageFilter(data.tb$Age)
# Encode each Age bracket as its numeric code in a new `age` column.
# vapply() calls ageFilter() once per element and checks that every result is
# a single number, so the column is numeric from the start (no detour through
# character and as.numeric()). It also copes with a zero-row tibble, where
# `1:length(x)` would wrongly iterate over c(1, 0).
data_random.tb$age <- vapply(
  data_random.tb$Age,
  ageFilter,
  numeric(1),
  USE.NAMES = FALSE
)
Regression: Age vs Purchase Value
# Least-squares fit of purchase amount on the numeric age code.
# Bare column names are resolved through `data`; writing `data_random.tb$...`
# inside the formula bypasses `data` and stops predict() from honouring
# `newdata`.
# `+ 0` drops the intercept, forcing the fitted line through the origin.
# NOTE(review): confirm that a no-intercept model is what is intended here.
fit <- lm(Purchase ~ age + 0, data = data_random.tb)
fit[1]
## $coefficients
## age
## 2273.659
Correlation: Age vs Purchase Value
# Correlation between the numeric age code and the purchase amount
# (cor() defaults to Pearson).
cor(
  x = as.numeric(data_random.tb[["age"]]),
  y = as.numeric(data_random.tb[["Purchase"]])
)
## [1] -0.0177134
# One-way ANOVA of purchase amount across the age brackets (Age is the
# categorical label column, not the numeric code).
# Bare column names are resolved through `data`; `data_random.tb$...` inside
# the formula would bypass it.
# Note: the result is stored under the name `anova`, which is also the name of
# the stats::anova() function.
anova <- aov(Purchase ~ Age, data = data_random.tb)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 6 2.827e+08 47118284 1.917 0.0746 .
## Residuals 1993 4.899e+10 24579562
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop only the rows that are missing a value the plots below actually use.
# na.omit() on the whole tibble discards a row when *any* column is NA, so
# missing values in unrelated columns (e.g. Product_Category_2 / 3) would
# silently shrink the plotted sample.
data_random_na.tb <- tidyr::drop_na(data_random.tb, Age, Purchase)
# Interactive plotly bar chart with age bracket on the x axis and purchase
# amount on the y axis. The bare `plot` on the last line auto-prints the
# widget.
plot <- plot_ly(
  type = "bar",
  name = "Age vs. Purchase Amount",
  x = data_random_na.tb[["Age"]],
  y = data_random_na.tb[["Purchase"]]
)
plot
# Box plot of purchase amount per age bracket, with the brackets shown in
# ascending age order.
data_random_na.tb %>%
  ggboxplot(
    x = "Age",
    y = "Purchase",
    xlab = "Age",
    ylab = "Money Spent",
    order = c("0-17", "18-25", "26-35", "36-45", "46-50", "51-55", "55+")
  )
